iiiD) Train/tune with resampling

Load package and data

1.1 Load package

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(coefplot)
library(rstanarm)
## Loading required package: Rcpp
## This is rstanarm version 2.21.4
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
##   options(mc.cores = parallel::detectCores())
## 
## Attaching package: 'rstanarm'
## 
## The following object is masked from 'package:coefplot':
## 
##     invlogit
## 
## The following objects are masked from 'package:caret':
## 
##     compare_models, R2
library(splines)
library(kernlab)
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha

1.2 Load data

# Load the paint-project training data (835 rows x 8 columns; Lightness and
# Saturation are character, the remaining six columns numeric).
df_all <- readr::read_csv("paint_project_train_data.csv", col_names = TRUE)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardize the continuous inputs (R, G, B, Hue), then re-attach the
# categorical inputs and the binary outcome.
# (Was base subset(); dplyr::select() is the idiomatic choice inside a
# tidyverse pipe and avoids subset()'s interactive-use caveats.)
df <- df_all %>%
  dplyr::select(R, G, B, Hue) %>%
  scale() %>% as.data.frame() %>%
  bind_cols(df_all %>% dplyr::select(Lightness, Saturation, outcome))

# caret's twoClassSummary() requires the outcome as a factor with syntactic
# level names; the positive class ('event') is listed first.
dfiii <- df %>%
  mutate(outcome = ifelse(outcome == 1, 'event', 'non_event'),
         outcome = factor(outcome, levels = c('event', 'non_event')))

dfiii %>% glimpse()
## Rows: 835
## Columns: 7
## $ R          <dbl> -0.19790120, -2.74419521, -0.19790120, -2.70931447, -0.2327…
## $ G          <dbl> -2.3619736, -1.7631189, -1.6433480, -1.7830807, -2.2022790,…
## $ B          <dbl> -1.7994266, -0.1706872, -1.8726283, -0.1523868, -1.8726283,…
## $ Hue        <dbl> -1.3548215, 1.3198239, -0.9585777, 1.4188848, -1.2557605, -…
## $ Lightness  <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ outcome    <fct> event, event, event, non_event, non_event, non_event, non_e…

2. ROC

2.1 Train parameter

# 5-fold cross-validation repeated 3 times. twoClassSummary() reports
# ROC / Sens / Spec and needs class probabilities; predictions are saved so
# resamples() can compare models later.
my_ctrl <- trainControl(method = "repeatedcv",
                        number = 5,
                        repeats = 3,
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE,
                        savePredictions = TRUE)

# Select tuning parameters by area under the ROC curve.
my_metric <- "ROC"

2.2 Build models

All categorical and continuous inputs - linear additive features

set.seed(1234)
# Baseline: additive logistic regression on all six inputs.
roc_glm_add <- train(outcome ~ .,
                  data = dfiii,
                  method = "glm",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)

roc_glm_add
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (16), scaled (16) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7856635  0.3529465  0.9596294

All main effects plus all pairwise interactions via `(.)^2` (note: this formula includes the categorical inputs in the interactions as well, and the extra `+ Lightness + Saturation` terms are redundant because `.` already contains them)

set.seed(1234)
# NOTE(review): (.)^2 expands to all main effects plus all pairwise
# interactions of every input (categorical included), so the extra
# "+ Lightness + Saturation" terms are redundant duplicates. If only
# continuous-input interactions were intended, the formula should be
# (R + G + B + Hue)^2 + Lightness + Saturation -- confirm intent.
roc_glm_pair <- train(outcome ~ (.)^2 + Lightness + Saturation,
                  data = dfiii,
                  method = "glm",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
roc_glm_pair
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (106), scaled (106) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8063522  0.5131354  0.8783915

The 2 models selected from iiiA)

set.seed(1234)
# Model 6 from iiiA): pairwise interactions of the continuous inputs plus
# additive categorical terms.
roc_glm_mod6 <- train(outcome ~ (R + G + B + Hue)^2 + (Saturation + Lightness),
                  data = dfiii,
                  method = "glm",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)

roc_glm_mod6
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (22), scaled (22) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8354789  0.4174989  0.9275638
# FIX: the original run omitted set.seed() here, so this model was resampled
# on different folds than the other models (the differing fold sizes printed
# below show this), making the resamples() comparison unpaired. Reset the
# seed so the CV folds match.
set.seed(1234)
# Model 8 from iiiA): squared continuous terms plus additive categorical
# inputs.
roc_glm_mod8 <- train(outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2)) + (Saturation + Lightness),
                  data = dfiii,
                  method = "glm",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)

roc_glm_mod8
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (16), scaled (16) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 668, 668, 668, 668, 668, ... 
## Resampling results:
## 
##   ROC        Sens      Spec     
##   0.8069767  0.377148  0.9502947

2.3 Regularized regression with Elastic net

set.seed(1234)
# Elastic-net warm-up on the pairwise-interaction feature set using caret's
# default (alpha, lambda) grid.
roc_glm_enet_pair_warmup <- train(outcome ~ (.)^2 + Lightness + Saturation,
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)

roc_glm_enet_pair_warmup
## glmnet 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (106), scaled (106) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        ROC        Sens       Spec     
##   0.10   0.0003523518  0.8306778  0.5183086  0.8975371
##   0.10   0.0035235183  0.8189651  0.4450292  0.9192668
##   0.10   0.0352351830  0.7842403  0.3178587  0.9523861
##   0.55   0.0003523518  0.8315981  0.5201080  0.8970163
##   0.55   0.0035235183  0.8121631  0.4119208  0.9270268
##   0.55   0.0352351830  0.7663396  0.3249213  0.9684270
##   1.00   0.0003523518  0.8320552  0.5183986  0.8939115
##   1.00   0.0035235183  0.8064952  0.3823212  0.9368702
##   1.00   0.0352351830  0.7465383  0.3301395  0.9689438
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.0003523518.

Tune

# Refined grid: 5 alpha values from ridge (0) to lasso (1) crossed with 25
# lambda values log-spaced over the warm-up model's lambda range.
tune_grid_enet_roc <- expand.grid(.alpha = seq(0, 1, length.out = 5),  
                         .lambda = exp(
                        seq(log(min(roc_glm_enet_pair_warmup$results$lambda)),
                          log(max(roc_glm_enet_pair_warmup$results$lambda)),
                          length.out = 25)))

set.seed(1234)
# Tuned elastic net on the pairwise-interaction feature set.
roc_glm_enet_pair_tuned <- train(outcome ~ (.)^2 + Lightness + Saturation,
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  tuneGrid = tune_grid_enet_roc,
                  trControl = my_ctrl)

# Log x-axis because the lambda grid is log-spaced.
plot(roc_glm_enet_pair_tuned, xTrans = log)

# Winning (alpha, lambda) pair from the grid search.
roc_glm_enet_pair_besttune <- roc_glm_enet_pair_tuned$bestTune
roc_glm_enet_pair_besttune
##     alpha      lambda
## 102     1 0.000426884
# Look up the cross-validated ROC of the winning (alpha, lambda) pair by
# joining the full results table against bestTune on the shared tuning
# columns (exact-match join, equivalent to filtering with ==).
roc_glm_enet_pair_roc <- merge(roc_glm_enet_pair_tuned$results,
                               roc_glm_enet_pair_tuned$bestTune)[["ROC"]]
roc_glm_enet_pair_roc
## [1] 0.8322752

In the roc_glm_enet_pair tuned model, the best alpha is 1 and lambda is about 0.00043. In this condition, the ROC is 0.83.

set.seed(1234)
# Elastic-net warm-up for model 9: squared terms and the full R*G*B*Hue
# interaction set, each crossed with the categorical inputs.
roc_glm_enet_mod9_warmup <- train(outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2) + R * G * B * Hue) * (Saturation + Lightness),
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)

plot(roc_glm_enet_mod9_warmup, xTrans = log)

Tune

# 5 alpha values crossed with 25 log-spaced lambdas spanning the mod9
# warm-up's lambda range.
tune_grid_enet_mod9_roc <- expand.grid(.alpha = seq(0, 1, length.out = 5),  
                         .lambda = exp(
                        seq(log(min(roc_glm_enet_mod9_warmup$results$lambda)),
                          log(max(roc_glm_enet_mod9_warmup$results$lambda)),
                          length.out = 25)))

set.seed(1234)
# Tuned elastic net for model 9.
roc_glm_enet_mod9_tune <- train(outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2) + R * G * B * Hue) * (Saturation + Lightness),
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  tuneGrid = tune_grid_enet_mod9_roc,
                  trControl = my_ctrl)

plot(roc_glm_enet_mod9_tune, xTrans = log)

# Winning (alpha, lambda) pair from the grid search.
roc_glm_enet_mod9_besttune <- roc_glm_enet_mod9_tune$bestTune
roc_glm_enet_mod9_besttune
##     alpha      lambda
## 113     1 0.003523518
# Cross-validated ROC at the best (alpha, lambda): filter the results table
# down to the bestTune row and pull out the ROC column.
roc_glm_enet_mod9_roc <- roc_glm_enet_mod9_tune$results %>%
  filter(alpha == roc_glm_enet_mod9_tune$bestTune$alpha,
         lambda == roc_glm_enet_mod9_tune$bestTune$lambda) %>%
  pull(ROC)
roc_glm_enet_mod9_roc
## [1] 0.8535447

In the roc_glm_enet_mod9 tuned model, the best alpha is 1 and lambda is 0.0035. In this condition, the ROC is 0.85.

set.seed(1234)
# Elastic-net warm-up for model 8: squared terms plus additive categorical
# inputs (no interactions).
roc_glm_enet_mod8_warmup <- train(outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2)) + (Saturation + Lightness),
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl)

plot(roc_glm_enet_mod8_warmup, xTrans = log)

Tune

# 5 alpha values crossed with 25 log-spaced lambdas spanning the mod8
# warm-up's lambda range.
tune_grid_enet_mod8_roc <- expand.grid(.alpha = seq(0, 1, length.out = 5),  
                         .lambda = exp(
                        seq(log(min(roc_glm_enet_mod8_warmup$results$lambda)),
                          log(max(roc_glm_enet_mod8_warmup$results$lambda)),
                          length.out = 25)))

set.seed(1234)
# BUG FIX: the original call reused the mod9 formula (squared terms plus the
# full R*G*B*Hue interactions), so "mod8 tune" actually refit mod9 again --
# which is why the printed bestTune/ROC below are byte-identical to mod9's.
# Tune the mod8 formula that roc_glm_enet_mod8_warmup and
# tune_grid_enet_mod8_roc were built for.
roc_glm_enet_mod8_tune <- train(outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2)) + (Saturation + Lightness),
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  tuneGrid = tune_grid_enet_mod8_roc,
                  trControl = my_ctrl)

plot(roc_glm_enet_mod8_tune, xTrans = log)

# Winning (alpha, lambda) pair for the mod8 elastic net.
roc_glm_enet_mod8_besttune <- roc_glm_enet_mod8_tune$bestTune
roc_glm_enet_mod8_besttune
##     alpha      lambda
## 113     1 0.003523518
# Cross-validated ROC at the best (alpha, lambda): join the results table
# against bestTune on the shared tuning columns (exact-match join,
# equivalent to filtering with ==).
roc_glm_enet_mod8_roc <- merge(roc_glm_enet_mod8_tune$results,
                               roc_glm_enet_mod8_tune$bestTune)[["ROC"]]
roc_glm_enet_mod8_roc
## [1] 0.8535447

In the roc_glm_enet_mod8 tuned model, the best alpha is 1 and lambda is 0.0035. In this condition, the ROC is 0.85. (Note: these printed results are identical to mod9's because the tuning call above reused the mod9 formula.)

2.4 Neural network

set.seed(1234)
# Neural network warm-up with caret's default size/decay grid; trace = FALSE
# suppresses nnet's per-iteration console output.
roc_nnet_warmup <- train(outcome ~ .,
                  data = dfiii,
                  method = "nnet",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl,
                  trace = FALSE)
plot(roc_nnet_warmup, xTrans = log)

Tune

# Grid over hidden-layer size and weight decay.
tune_grid_neural <- expand.grid(size = c(1:5, 10),
                                decay = c(0, 0.05, 0.1, 1, 2))

# FIX: reset the seed before training so the tuned nnet uses the same CV
# folds as every other model (the original omitted set.seed() here, which
# left this model's resamples() comparison unpaired).
set.seed(1234)
roc_nnet_tune <- train(outcome ~ .,
                    data = dfiii,
                    method = "nnet",
                    metric = my_metric,
                    tuneGrid = tune_grid_neural,
                    preProcess = c("center", "scale"),
                    trControl = my_ctrl,
                    trace = FALSE)

plot(roc_nnet_tune, xTrans = log)

# Best (size, decay) from the grid search.
roc_nnet_tune_besttune <- roc_nnet_tune$bestTune
roc_nnet_tune_besttune
##    size decay
## 13    3   0.1
# NOTE(review): roc_nnet_besttune duplicates roc_nnet_tune_besttune above.
roc_nnet_besttune <- roc_nnet_tune$bestTune
# Cross-validated ROC at the best (size, decay).
roc_nnet_roc <- roc_nnet_tune$results[
  roc_nnet_tune$results$size == roc_nnet_tune$bestTune$size & 
  roc_nnet_tune$results$decay == roc_nnet_tune$bestTune$decay,
  "ROC"]
roc_nnet_roc
## [1] 0.8581684

For the neural network model, the best size is 3 and the decay is 0.1. In this condition, the ROC is 0.86.

2.5 Random forest

set.seed(1234)
# Random forest warm-up (caret's default mtry grid).
# NOTE(review): 'trace' is not a documented randomForest argument (it uses
# do.trace), so this is presumably ignored via ... -- confirm. Centering and
# scaling are unnecessary for tree models, though harmless.
roc_rf_warmup <- train(outcome ~ .,
                  data = dfiii,
                  method = "rf",
                  metric = my_metric,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl,
                  trace = FALSE)
plot(roc_rf_warmup, xTrans = log)

Tune

set.seed(1234)
# Tune mtry (inputs tried per split) over 2..8; importance = TRUE stores
# variable importance for later inspection.
roc_rf_tune <- train(outcome ~ .,
                  data = dfiii,
                  method = "rf",
                  metric = my_metric,
                  trControl = my_ctrl,
                  tuneGrid = expand.grid(mtry = seq(2, 8, by = 1)),
                  importance = TRUE)
plot(roc_rf_tune, xTrans = log)

2.6 Gradient boosted tree

set.seed(1234)
# Gradient boosted tree warm-up with caret's default gbm grid;
# verbose = FALSE silences gbm's progress output.
roc_gbm_warmup <- train(outcome ~ .,
                      data = dfiii,
                      method = "gbm",
                      metric = my_metric,
                      trControl = my_ctrl,
                      verbose = FALSE)
plot(roc_gbm_warmup, xTrans = log)

Tune

# Expand n.trees and shrinkage while holding interaction depth and minimum
# node size at the warm-up's best values.
gbm_grid <- expand.grid(n.trees = c(100, 150, 300, 500, 750, 1000),
                        shrinkage = c(0.01, 0.1),
                        interaction.depth = roc_gbm_warmup$bestTune$interaction.depth,
                        n.minobsinnode = roc_gbm_warmup$bestTune$n.minobsinnode)
set.seed(1234)
roc_gbm_tune <- train(outcome ~ .,
                      data = dfiii,
                      method = "gbm",
                      metric = my_metric,
                      tuneGrid = gbm_grid,
                      trControl = my_ctrl,
                      verbose=FALSE)
plot(roc_gbm_tune, xTrans = log)

2.7 SVM

set.seed(1234)
# RBF-kernel SVM warm-up; caret tunes C and estimates sigma by default.
roc_svm_warmup <- train(outcome ~ .,
                 data = dfiii,
                 method = "svmRadial",
                 metric = my_metric,
                 preProcess = c("center", "scale"),
                 trControl = my_ctrl)

plot(roc_svm_warmup)

Tune

# Power-of-two grid: cost C over 2^-5..2^5 and kernel width sigma over
# 2^-15..2^-5.
svm_grid <- expand.grid(
  .C = 2^seq(-5, 5, by = 1),
  .sigma = 2^seq(-15, -5, by = 1)
)


set.seed(1234)
# The "maximum number of iterations reached" messages printed below come
# from kernlab's optimizer on some fold/parameter combinations.
roc_svm_tuned <- train(outcome ~ .,
                 data = dfiii,
                 method = "svmRadial",
                 metric = my_metric,
                 tuneGrid = svm_grid,
                 preProcess = c("center", "scale"),
                 trControl = my_ctrl)
## maximum number of iterations reached 0.006803206 -0.006749595maximum number of iterations reached 0.008883975 -0.0086964maximum number of iterations reached 0.002827103 -0.002898284maximum number of iterations reached 2.629518e-05 -2.631537e-05maximum number of iterations reached 0.008601321 -0.008439173maximum number of iterations reached 0.00316154 -0.003249339maximum number of iterations reached 2.271904e-05 -2.272969e-05maximum number of iterations reached 0.003298857 -0.003387905maximum number of iterations reached 4.161542e-05 -4.166808e-05maximum number of iterations reached 2.023221e-05 -2.02412e-05maximum number of iterations reached 0.007547289 -0.007467774maximum number of iterations reached 0.008760768 -0.00867095maximum number of iterations reached 0.002389034 -0.002482381maximum number of iterations reached 0.008500135 -0.008415446maximum number of iterations reached 0.002356658 -0.002443247maximum number of iterations reached 1.190669e-05 -1.191186e-05maximum number of iterations reached 0.002359268 -0.002452468maximum number of iterations reached 9.218427e-06 -9.220375e-06maximum number of iterations reached 2.579084e-05 -2.58221e-05maximum number of iterations reached 0.00742806 -0.007356151maximum number of iterations reached 0.008861739 -0.008751837maximum number of iterations reached 0.002514656 -0.002596214maximum number of iterations reached 9.365318e-06 -9.365492e-06maximum number of iterations reached 0.009273782 -0.009128191maximum number of iterations reached 0.002385768 -0.002457711maximum number of iterations reached 0.0028075 -0.002904203maximum number of iterations reached 9.992463e-06 -9.992932e-06maximum number of iterations reached 0.006865783 -0.006808732maximum number of iterations reached 0.00844391 -0.008311954maximum number of iterations reached 0.003285983 -0.003397471maximum number of iterations reached 1.3988e-05 -1.399243e-05maximum number of iterations reached 0.008917699 -0.008762911maximum number of iterations reached 
0.002923346 -0.003016591maximum number of iterations reached 2.072846e-05 -2.074173e-05maximum number of iterations reached 0.002758083 -0.002831798maximum number of iterations reached 1.957621e-05 -1.958761e-05maximum number of iterations reached 2.690519e-05 -2.692885e-05maximum number of iterations reached 0.008262198 -0.008161896maximum number of iterations reached 0.00885438 -0.008771156maximum number of iterations reached 0.001809866 -0.001864107maximum number of iterations reached 0.008889571 -0.008807336maximum number of iterations reached 0.002299905 -0.002378792maximum number of iterations reached 0.002080324 -0.002150144maximum number of iterations reached 0.007764216 -0.00768031maximum number of iterations reached 0.0089361 -0.00881104maximum number of iterations reached 0.002743466 -0.002848237maximum number of iterations reached 9.741658e-06 -9.742511e-06maximum number of iterations reached 0.009112058 -0.008982366maximum number of iterations reached 0.002319627 -0.002394795maximum number of iterations reached 9.597567e-06 -9.598684e-06maximum number of iterations reached 0.002327806 -0.002403018maximum number of iterations reached 9.343197e-06 -9.344213e-06maximum number of iterations reached 1.206216e-05 -1.206574e-05maximum number of iterations reached 0.008452883 -0.008340261maximum number of iterations reached 0.008901759 -0.008875535maximum number of iterations reached 0.002108803 -0.002189419maximum number of iterations reached 0.008788357 -0.008754788maximum number of iterations reached 0.002037905 -0.002118792maximum number of iterations reached 0.001918232 -0.001991176maximum number of iterations reached 0.006619289 -0.006570177maximum number of iterations reached 0.008871418 -0.008691176maximum number of iterations reached 0.002989465 -0.003061961maximum number of iterations reached 2.137888e-05 -2.139203e-05maximum number of iterations reached 0.008842904 -0.008665567maximum number of iterations reached 0.003224748 -0.003319416maximum 
number of iterations reached 3.047094e-05 -3.049931e-05maximum number of iterations reached 0.003427416 -0.003521007maximum number of iterations reached 3.547519e-05 -3.551605e-05maximum number of iterations reached 2.819237e-05 -2.821547e-05maximum number of iterations reached 0.006385595 -0.006339744maximum number of iterations reached 0.008807341 -0.008637217maximum number of iterations reached 0.003395168 -0.003484759maximum number of iterations reached 2.033957e-05 -2.034958e-05maximum number of iterations reached 0.00863832 -0.008483818maximum number of iterations reached 0.002972961 -0.00305533maximum number of iterations reached 2.140562e-05 -2.14187e-05maximum number of iterations reached 0.002809367 -0.002883116maximum number of iterations reached 3.605426e-05 -3.609485e-05maximum number of iterations reached 3.212602e-05 -3.215943e-05maximum number of iterations reached 0.006830154 -0.006771939maximum number of iterations reached 0.008456254 -0.00836044maximum number of iterations reached 0.002747732 -0.002835985maximum number of iterations reached 9.35224e-06 -9.352943e-06maximum number of iterations reached 0.008997933 -0.008852414maximum number of iterations reached 0.002353767 -0.002426406maximum number of iterations reached 1.287805e-05 -1.288171e-05maximum number of iterations reached 0.002609408 -0.002696477maximum number of iterations reached 3.299908e-05 -3.304028e-05maximum number of iterations reached 1.307418e-05 -1.307801e-05maximum number of iterations reached 0.006256759 -0.00621472maximum number of iterations reached 0.008762905 -0.008574634maximum number of iterations reached 0.003735803 -0.003821754maximum number of iterations reached 3.683779e-05 -3.687972e-05maximum number of iterations reached 0.008174076 -0.008022271maximum number of iterations reached 0.003435722 -0.003524095maximum number of iterations reached 6.260091e-05 -6.272237e-05maximum number of iterations reached 0.003717061 -0.003807392maximum number of iterations 
reached 4.692277e-05 -4.699032e-05maximum number of iterations reached 3.338384e-05 -3.341127e-05maximum number of iterations reached 0.006696397 -0.006646518maximum number of iterations reached 0.009111809 -0.008917537maximum number of iterations reached 0.003518967 -0.003611675maximum number of iterations reached 3.393554e-05 -3.396948e-05maximum number of iterations reached 0.008342849 -0.008188105maximum number of iterations reached 0.00283729 -0.002901497maximum number of iterations reached 4.783611e-05 -4.791368e-05maximum number of iterations reached 0.003358165 -0.003445215maximum number of iterations reached 3.196415e-05 -3.199591e-05maximum number of iterations reached 5.034584e-05 -5.043427e-05maximum number of iterations reached 0.006337222 -0.006289426maximum number of iterations reached 0.008833625 -0.008668051maximum number of iterations reached 0.003160341 -0.003249474maximum number of iterations reached 1.683193e-05 -1.683741e-05maximum number of iterations reached 0.008930259 -0.008756452maximum number of iterations reached 0.0031709 -0.003268572maximum number of iterations reached 2.173465e-05 -2.174734e-05maximum number of iterations reached 0.003240555 -0.003347279maximum number of iterations reached 2.364326e-05 -2.365981e-05maximum number of iterations reached 3.511544e-05 -3.515604e-05maximum number of iterations reached 0.008433343 -0.008326377maximum number of iterations reached 0.008943314 -0.008885949maximum number of iterations reached 0.001978087 -0.002053607maximum number of iterations reached 0.008973671 -0.008924695maximum number of iterations reached 0.002087266 -0.002167601maximum number of iterations reached 0.002143859 -0.002223415maximum number of iterations reached 0.009464055 -0.009314208maximum number of iterations reached 0.008970714 -0.009004539maximum number of iterations reached 0.001787732 -0.00185606maximum number of iterations reached 0.008912999 -0.008945559maximum number of iterations reached 0.001610061 
-0.001663547maximum number of iterations reached 0.001659524 -0.001718823
plot(roc_svm_tuned, xTrans = log)

2.8 PLS

set.seed(1234)
# Partial least squares warm-up (caret's default ncomp grid).
roc_pls_warmup <- train(outcome ~ .,
                      data = dfiii,
                      method = "pls",
                      metric = my_metric,
                      preProcess = c("center", "scale"),
                      trControl = my_ctrl)
plot(roc_pls_warmup, xTrans = log)

Tune

set.seed(1234)
# NOTE(review): the seed is set before the grid rather than immediately
# before train(); expand.grid() consumes no random numbers, so the folds are
# still reproducible, but moving set.seed() next to train() would match the
# rest of the file.
PLS_tuneGrid <- expand.grid(
  .ncomp = seq(1, 10, by = 1)
)

# Tune the number of PLS components from 1 to 10.
roc_pls_tune <- train(outcome ~ .,
                      data = dfiii,
                      method = "pls",
                      metric = my_metric,
                      tuneGrid = PLS_tuneGrid,
                      preProcess = c("center", "scale"),
                      trControl = my_ctrl)
plot(roc_pls_tune, xTrans = log)

3. ROC Model Evaluation

3.1 Identify the best model.

# Gather every model's resampling results for comparison. Models fit after
# re-setting set.seed(1234) share identical CV folds, which resamples()
# pairs by resample index.
ROC_perform <- resamples(list(GLM_ADD = roc_glm_add,
                                 GLM_PAIR = roc_glm_pair,
                                 GLM_MOD6 = roc_glm_mod6,
                                 GLM_MOD8 = roc_glm_mod8,
                                 ENET_PAIR_WARMUP = roc_glm_enet_pair_warmup,
                                 ENET_PAIR_TUNE = roc_glm_enet_pair_tuned,
                                 ENET_MOD8_WARMUP = roc_glm_enet_mod8_warmup,
                                 ENET_MOD8_TUNE = roc_glm_enet_mod8_tune,
                                 ENET_MOD9_WARMUP = roc_glm_enet_mod9_warmup,
                                 ENET_MOD9_TUNE = roc_glm_enet_mod9_tune,
                                 GBM_WARMUP = roc_gbm_warmup,
                                 GBM_TUNE = roc_gbm_tune,
                                 NNET_WARMUP = roc_nnet_warmup,
                                 NNET_TUNE = roc_nnet_tune,
                                 RF_WARMUP = roc_rf_warmup,
                                 RF_TUNE = roc_rf_tune,
                                 SVM_WARMUP = roc_svm_warmup,
                                 SVM_TUNE = roc_svm_tuned,
                                 PLS_TUNE = roc_pls_tune,
                                 PLS_WARMUP = roc_pls_warmup  # FIX: was misspelled PLS_WARWUP
                                 )
                            )

4. Accuracy

4.1 Train parameter

# Default summary (Accuracy / Kappa): no class probabilities required.
my_ctrl_acc <- trainControl(method = "repeatedcv", number = 5, repeats = 3)
# Select tuning parameters by classification accuracy.
my_metric_acc <- "Accuracy"

4.2 Build Model

All categorical and continuous inputs - linear additive features

set.seed(1234)
# Accuracy counterpart of the additive logistic regression baseline.
acc_glm_add <- train(outcome ~ .,
                  data = dfiii,
                  method = "glm",
                  metric = my_metric_acc,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)

acc_glm_add
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (16), scaled (16) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8208036  0.3784784

All main effects plus all pairwise interactions via `(.)^2` (note: this formula includes the categorical inputs in the interactions as well, and the extra `+ Lightness + Saturation` terms are redundant because `.` already contains them)

set.seed(1234)
# NOTE(review): (.)^2 already includes all main effects and all pairwise
# interactions (categorical inputs included); the "+ Lightness + Saturation"
# terms are redundant duplicates.
acc_glm_pair <- train(outcome ~ (.)^2 + Lightness + Saturation,
                  data = dfiii,
                  method = "glm",
                  metric = my_metric_acc,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
acc_glm_pair
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (106), scaled (106) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7948291  0.4026757

The 2 models selected from iiiA)

set.seed(1234)
# NOTE(review): named acc_glm_mod4 but the formula matches the model called
# mod6 in the ROC section -- confirm which iiiA) model is intended.
acc_glm_mod4 <- train(outcome ~ (R + G + B + Hue)^2 + (Saturation + Lightness),
                  data = dfiii,
                  method = "glm",
                  metric = my_metric_acc,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)

acc_glm_mod4
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (22), scaled (22) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8108164  0.3896845
# FIX: add set.seed() so this model is resampled on the same folds as the
# other accuracy models (the fold sizes printed below came from an unseeded
# run and therefore differ from the rest of the section).
set.seed(1234)
acc_glm_mod8 <- train(outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2)) + (Saturation + Lightness),
                  data = dfiii,
                  method = "glm",
                  metric = my_metric_acc,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)

acc_glm_mod8
## Generalized Linear Model 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (16), scaled (16) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 668, 668, 668, 668, 668, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8191823  0.3895074

4.3 Regularized regression with Elastic net - ACC

set.seed(1234)
# Elastic-net warm-up (accuracy metric) on the pairwise-interaction set.
acc_glm_enet_pair_warmup <- train(outcome ~ (.)^2 + Lightness + Saturation,
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric_acc,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)

plot(acc_glm_enet_pair_warmup, xTrans = log)

Tune

# 5 alpha values crossed with 25 log-spaced lambdas spanning the warm-up's
# lambda range.
tune_grid_enet_pair <- expand.grid(.alpha = seq(0, 1, length.out = 5),  
                         .lambda = exp(
                          seq(
                            log(min(acc_glm_enet_pair_warmup$results$lambda)),
                          log(max(acc_glm_enet_pair_warmup$results$lambda)),
                          length.out = 25)))

set.seed(1234)
# Tuned elastic net (accuracy metric) on the pairwise-interaction set.
acc_glm_enet_pair_tuned <- train(outcome ~ (.)^2 + Lightness + Saturation,
                  data = dfiii,
                  method = "glmnet",
                  metric = my_metric_acc,
                  tuneGrid = tune_grid_enet_pair,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)

plot(acc_glm_enet_pair_tuned, xTrans = log)

# Model 9 warm-up: quadratic color terms and the full R*G*B*Hue interaction,
# each crossed with the categorical inputs, fit with glmnet's default grid.
set.seed(1234)
acc_glm_enet_mod9_warmup <- train(
  outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2) + R * G * B * Hue) * (Saturation + Lightness),
  data = dfiii,
  method = "glmnet",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  preProcess = c("center", "scale")
)
plot(acc_glm_enet_mod9_warmup, xTrans = log)

Tune

# Refined grid for model 9, built the same way as the pairwise grid: 5 alpha
# values crossed with 25 log-spaced lambdas spanning the warm-up's range.
mod9_lambdas <- acc_glm_enet_mod9_warmup$results$lambda
tune_grid_enet_mod9 <- expand.grid(
  .alpha = seq(0, 1, length.out = 5),
  .lambda = exp(seq(log(min(mod9_lambdas)),
                    log(max(mod9_lambdas)),
                    length.out = 25))
)

# Model 9 elastic net, tuned over the refined grid.
set.seed(1234)
acc_glm_enet_mod9_tuned <- train(
  outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2) + R * G * B * Hue) * (Saturation + Lightness),
  data = dfiii,
  method = "glmnet",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  tuneGrid = tune_grid_enet_mod9,
  preProcess = c("center", "scale")
)
plot(acc_glm_enet_mod9_tuned, xTrans = log)

# Model 8 warm-up: quadratic color terms plus additive categorical inputs,
# fit with glmnet's default tuning grid.
set.seed(1234)
acc_glm_enet_mod8_warmup <- train(
  outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2)) + (Saturation + Lightness),
  data = dfiii,
  method = "glmnet",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  preProcess = c("center", "scale")
)

# Print the resampled results for the default grid.
acc_glm_enet_mod8_warmup
## glmnet 
## 
## 835 samples
##   6 predictor
##   2 classes: 'event', 'non_event' 
## 
## Pre-processing: centered (16), scaled (16) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 668, 669, 667, 668, 668, 668, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        Accuracy   Kappa    
##   0.10   0.0003523518  0.8224053  0.3984576
##   0.10   0.0035235183  0.8208013  0.3877722
##   0.10   0.0352351830  0.8224029  0.3765199
##   0.55   0.0003523518  0.8224053  0.3985818
##   0.55   0.0035235183  0.8207989  0.3867972
##   0.55   0.0352351830  0.8227878  0.3714550
##   1.00   0.0003523518  0.8208061  0.3923758
##   1.00   0.0035235183  0.8211981  0.3867218
##   1.00   0.0352351830  0.8227878  0.3714550
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.03523518.

Tune

# Refined grid for model 8, mirroring the construction used for the other
# elastic net grids: 5 alphas crossed with 25 log-spaced lambdas.
mod8_lambdas <- acc_glm_enet_mod8_warmup$results$lambda
tune_grid_enet_mod8 <- expand.grid(
  .alpha = seq(0, 1, length.out = 5),
  .lambda = exp(seq(log(min(mod8_lambdas)),
                    log(max(mod8_lambdas)),
                    length.out = 25))
)

# Model 8 elastic net, tuned over the refined grid.
set.seed(1234)
acc_glm_enet_mod8_tuned <- train(
  outcome ~ (I(R^2) + I(G^2) + I(B^2) + I(Hue^2)) + (Saturation + Lightness),
  data = dfiii,
  method = "glmnet",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  tuneGrid = tune_grid_enet_mod8,
  preProcess = c("center", "scale")
)

plot(acc_glm_enet_mod8_tuned, xTrans = log)

4.4 Neural network

# Neural network warm-up with caret's default size/decay grid.
# trace = FALSE silences nnet's per-iteration console output.
set.seed(1234)
acc_nnet_warmup <- train(
  outcome ~ .,
  data = dfiii,
  method = "nnet",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  preProcess = c("center", "scale"),
  trace = FALSE
)
plot(acc_nnet_warmup, xTrans = log)

Tune

# Neural network tuning grid: hidden-layer sizes crossed with weight-decay values.
tune_grid_neural <- expand.grid(size = c(5, 10, 20),
                                decay = c(0, 0.05, 0.1, 1, 2))

# Fix: seed the RNG before the tuned fit so the resampling folds are
# reproducible and comparable with the other tuned models (every other tuned
# model in this file calls set.seed(1234) first; this one did not).
set.seed(1234)
acc_nnet_tune <- train(outcome ~ .,
                    data = dfiii,
                    method = "nnet",
                    metric = my_metric_acc,
                    tuneGrid = tune_grid_neural,
                    preProcess = c("center", "scale"),
                    trControl = my_ctrl_acc,
                    trace = FALSE)

plot(acc_nnet_tune, xTrans = log)

4.5 Random forest - ACC

# Random forest warm-up with caret's default mtry grid.
# Fix: the original passed trace = FALSE, which is not a randomForest()
# argument (its verbosity flag is do.trace) and was silently absorbed by `...`;
# dropped as a no-op.
set.seed(1234)
acc_rf_warmup <- train(outcome ~ .,
                  data = dfiii,
                  method = "rf",
                  metric = my_metric_acc,
                  preProcess = c("center", "scale"),
                  trControl = my_ctrl_acc)
plot(acc_rf_warmup, xTrans = log)

Tune

# Tuned random forest: search mtry from 2 through 8.
# importance = TRUE stores variable-importance scores for later inspection.
set.seed(1234)
acc_rf_tune <- train(
  outcome ~ .,
  data = dfiii,
  method = "rf",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  tuneGrid = expand.grid(mtry = 2:8),
  importance = TRUE
)
plot(acc_rf_tune, xTrans = log)

4.6 Gradient boosted tree - ACC

# Gradient boosted tree warm-up with caret's default gbm grid.
# verbose = FALSE silences gbm's per-iteration console output.
set.seed(1234)
acc_gbm_warmup <- train(
  outcome ~ .,
  data = dfiii,
  method = "gbm",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  verbose = FALSE
)

plot(acc_gbm_warmup, xTrans = log)

Tune

# Refine n.trees and shrinkage while holding interaction depth and minimum
# node size at the warm-up fit's best values.
acc_gbm_grid <- expand.grid(
  n.trees = c(100, 150, 300, 500, 750, 1000),
  shrinkage = c(0.01, 0.1),
  interaction.depth = acc_gbm_warmup$bestTune$interaction.depth,
  n.minobsinnode = acc_gbm_warmup$bestTune$n.minobsinnode
)

set.seed(1234)
acc_gbm_tune <- train(
  outcome ~ .,
  data = dfiii,
  method = "gbm",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  tuneGrid = acc_gbm_grid,
  verbose = FALSE
)

plot(acc_gbm_tune, xTrans = log)

4.7 SVM - ACC

# Radial-basis SVM warm-up; caret tunes sigma and C over its default grid.
set.seed(1234)
acc_svm_warmup <- train(
  outcome ~ .,
  data = dfiii,
  method = "svmRadial",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  preProcess = c("center", "scale")
)

plot(acc_svm_warmup, xTrans = log)

Tune

# Tuned radial-basis SVM over svm_grid.
# NOTE(review): svm_grid is not defined anywhere in this ACC section —
# presumably it was built during the earlier ROC tuning; confirm it is still
# in scope (and appropriate for this model) before running.
set.seed(1234)
acc_svm_tuned <- train(
  outcome ~ .,
  data = dfiii,
  method = "svmRadial",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  tuneGrid = svm_grid,
  preProcess = c("center", "scale")
)
plot(acc_svm_tuned, xTrans = log)

4.8 PLS - ACC

# Partial least squares warm-up with caret's default ncomp grid.
set.seed(1234)
acc_pls_warmup <- train(
  outcome ~ .,
  data = dfiii,
  method = "pls",
  trControl = my_ctrl_acc,
  metric = my_metric_acc,
  preProcess = c("center", "scale")
)

plot(acc_pls_warmup, xTrans = log)

Tune

# Tuned PLS model.
# Fix: the original "Tune" fit was byte-identical to the warm-up — it passed
# no tuneGrid/tuneLength, so it never explored additional components.
# tuneLength = 10 lets caret search up to 10 values of ncomp.
set.seed(1234)
acc_pls_tune <- train(outcome ~ .,
                      data = dfiii,
                      method = "pls",
                      metric = my_metric_acc,
                      tuneLength = 10,
                      preProcess = c("center", "scale"),
                      trControl = my_ctrl_acc)

plot(acc_pls_tune)

5. ACC Model Evaluation

5.1 Identify the best model.

# Gather every accuracy-trained model's resampling results so their
# cross-validated performance can be compared on a common set of folds.
acc_model_list <- list(
  GLM_ADD = acc_glm_add,
  GLM_PAIR = acc_glm_pair,
  GLM_MOD4 = acc_glm_mod4,
  ENET_PAIR_WARMUP = acc_glm_enet_pair_warmup,
  ENET_PAIR_TUNE = acc_glm_enet_pair_tuned,
  ENET_MOD8_WARMUP = acc_glm_enet_mod8_warmup,
  ENET_MOD8_TUNE = acc_glm_enet_mod8_tuned,
  ENET_MOD9_WARMUP = acc_glm_enet_mod9_warmup,
  ENET_MOD9_TUNE = acc_glm_enet_mod9_tuned,
  GBM_WARMUP = acc_gbm_warmup,
  GBM_TUNE = acc_gbm_tune,
  NNET_WARMUP = acc_nnet_warmup,
  NNET_TUNE = acc_nnet_tune,
  RF_WARMUP = acc_rf_warmup,
  RF_TUNE = acc_rf_tune,
  SVM_WARMUP = acc_svm_warmup,
  SVM_TUNE = acc_svm_tuned,
  PLS_WARMUP = acc_pls_warmup,
  PLS_TUNE = acc_pls_tune
)
ACC_perform <- resamples(acc_model_list)

6. Compare ROC and Accuracy

# Side-by-side dotplots of resampled performance: ROC AUC for the
# ROC-optimized models, Accuracy for the accuracy-optimized models.
# NOTE(review): ROC_perform is defined in an earlier section of this file.
dotplot(ROC_perform, metric = "ROC")

dotplot(ACC_perform, metric = "Accuracy")

Which model is the best if you are interested in maximizing Accuracy compared to maximizing the Area Under the ROC Curve (ROC AUC)? For Accuracy, the tuned Random Forest model is the best, while for ROC AUC, the tuned GBM model is the best.

# Persist the tuned GBM; create the output directory first so the save does
# not fail on a fresh checkout where Model/ does not yet exist.
dir.create("Model", showWarnings = FALSE, recursive = TRUE)
acc_gbm_tune %>% readr::write_rds('Model/acc_gbm_tune.rds')